<?php

namespace Kangcg\Reptile;

use mysql_xdevapi\Exception;
use \phpQuery;

/**
 * Crawler for Amazon category list pages and product detail pages.
 *
 * Pages are fetched with PHP stream wrappers, cached as temporary HTML files below
 * "<filePath>/tmp/", and parsed with phpQuery plus regular expressions. Progress is
 * reported through named events (see the class constants); a handler receives the
 * crawler instance as its first argument, followed by the event payload.
 */
class Amazon
{
    /** Event: download one image. Payload: (imageUrl, productName). The handler returns the stored path/URL. */
    const IMAGE_DOWNLOAD = 'image_download';
    /** Event: the fetched page is a robot-check page. Payload: (content, url). */
    const TRY_TRIGGER_CODE = 'try_trigger_code';
    /** Event: a product page was parsed. Payload: (goods, url, listParams). Return false to stop the list loop. */
    const REQUEST_ITEM_PAGE_SUCCESS = 'item_page_success';
    /** Event: a product page could not be parsed. Payload: (url, listParams). Return false to stop the list loop. */
    const REQUEST_ITEM_PAGE_FAIL = 'item_page_fail';

    /** Event: a list page was processed completely. Payload: (params). Return false to stop paging. */
    const REQUEST_LIST_PAGE_SUCCESS = 'list_page_success';
    /** Event: a list page failed or was aborted. Payload: (params). Return false to stop paging. */
    const REQUEST_LIST_PAGE_FAIL = 'list_page_fail';

    /**
     * Optional proxy, e.g. "127.0.0.1:8787" or "tcp://127.0.0.1:8787".
     */
    public $proxy = null;

    /** Built-in site base URLs, selectable by index through the constructor's $url argument. */
    private $urls = [
        'https://www.amazon.cn/', // China
        'https://www.amazon.fr/', // France
        'https://www.amazon.it/', // Italy
        'https://www.amazon.es/', // Spain
        'https://www.amazon.ca/', // Canada
        'https://www.amazon.in/', // India
        'https://www.amazon.com.br/', // Brazil
        'https://www.amazon.com.mx/', // Mexico
        'https://www.amazon.com.au/', // Australia
        'https://www.amazon.com.tr/', // Turkey
        'https://www.amazon.co.uk/', // United Kingdom
        'https://www.amazon.de/', // Germany
        'https://www.amazon.co.jp/', // Japan
    ];

    private $waitCount;
    private $waitTime;
    private $retry;
    private $baseUrl;
    private $filePath;
    private $imgPath;
    private $pageWait = 1;

    /** Map of event name => callable. */
    private $_event = [];

    private $_success_count = 0;
    private $_success_total = 0;

    /**
     * @param string $filePath Directory below which temporary page files are stored
     * @param string $imgPath Directory below which downloaded images are stored
     * @param int $waitCount Pause after every $waitCount successfully fetched pages (0 disables the pause)
     * @param int $waitTime Pause length in seconds; also used when a robot check is triggered
     * @param int $retry Number of additional attempts after an empty response
     * @param int|string $url Index into the built-in site list, or a custom base URL
     * @param int $pageWait Seconds to sleep before every HTTP request (minimum 1)
     * @throws \InvalidArgumentException when $url is neither a known index nor a URL string
     * @throws \RuntimeException when phpQuery is not installed
     */
    public function __construct($filePath, $imgPath, int $waitCount = 200, $waitTime = 1800, $retry = 3, $url = 0, int $pageWait = 3)
    {
        // realpath() returns false for a directory that does not exist yet; fall back to the
        // given path so temp files do not silently end up under "/tmp/" at the filesystem root.
        $resolved = realpath($filePath);
        $this->filePath = $resolved !== false ? $resolved : rtrim((string)$filePath, '/\\');
        $this->imgPath = $imgPath;
        $this->waitCount = $waitCount;
        $this->waitTime = (int)$waitTime;
        $this->pageWait = $pageWait > 0 ? $pageWait : 1;
        $this->retry = (int)$retry;

        $isKey = is_int($url) || is_string($url);
        $base = ($isKey && isset($this->urls[$url])) ? $this->urls[$url] : $url;
        if (!is_string($base) || $base === '') {
            throw new \InvalidArgumentException('Unknown Amazon site: ' . var_export($url, true));
        }

        // Every URL is built as baseUrl . "relative/path", so guarantee exactly one trailing slash.
        $this->baseUrl = rtrim($base, '/') . '/';
        $this->__init();
    }

    /**
     * Placeholder entry point; intentionally empty.
     */
    public function run()
    {

    }

    /**
     * Registers (or replaces) the handler of an event.
     *
     * @param string $name One of the event-name constants of this class
     * @param callable $callback Called as $callback($crawler, ...$payload)
     * @return $this
     */
    public function on($name, callable $callback)
    {
        $this->_event[$name] = $callback;
        return $this;
    }

    /**
     * Crawls the product list of a category, page after page.
     *
     * @param array $params Search parameters: [
     * 'i' => '',
     * 'bbn' => '',
     * 'rh' => '',
     * 'node' => '',
     * 'page' => '',  // last finished page; crawling starts at page + 1
     * 'count' => '', // optional: index of the last processed item on the first crawled page
     * ]
     * @param null|int|string $totalPage Known number of pages; read from the pagination strip when empty
     * @return bool|null true once the last page has been processed, null when an event handler stopped the crawl
     * @throws \Exception
     */
    public function requestListPage(array $params, $totalPage = null)
    {
        // "page" and "count" are taken out before array_filter(), which would drop a legitimate 0.
        $page = (int)($params['page'] ?? 0);
        $cursor = isset($params['count']) ? (int)$params['count'] : null;
        $params = array_filter($params);
        unset($params['code'], $params['count']);

        while (true) {
            $page++;
            $params['page'] = $page;
            if ($page == 1) {
                $params['fs'] = 'true';
            }

            // "code" and "count" are internal bookkeeping and must not be sent to Amazon.
            $query = $params;
            unset($query['code'], $query['count']);
            $url = $this->baseUrl . 's?' . http_build_query($query);

            $filename = null;
            $content = $this->requestUrl($url, 0, $filename);

            $params['code'] = 'success';
            if ($cursor !== null) {
                $params['count'] = $cursor;
            }

            if ($content) {
                $params = $this->parserListPage($content, $params);
            } else {
                // Nothing was fetched (retries exhausted or robot check): report it as a failure.
                $params['code'] = 'fail';
            }

            $name = $params['code'] == 'success' ? self::REQUEST_LIST_PAGE_SUCCESS : self::REQUEST_LIST_PAGE_FAIL;
            unset($params['code']);
            if ($this->executeEvent($name, $params) === false) {
                return;
            }

            $hasFile = is_string($filename) && is_file($filename);
            if (empty($totalPage) && $hasFile) {
                $phpQueryObject = \phpQuery::newDocumentFile($filename);
                $totalPage = trim(\phpQuery::pq(".s-pagination-strip span:last", $phpQueryObject)->text());
            }

            if ($hasFile) {
                $this->removeFiles($filename);
            }

            // The integer cast makes a missing or non-numeric page count end the crawl
            // instead of paging forever.
            if ($page >= (int)$totalPage) {
                return true;
            }

            // The resume cursor only applies to the first crawled page.
            $cursor = null;
            unset($params['count']);
        }
    }

    /**
     * Fetches and parses one product page, including one page per variant when the
     * product has several specifications.
     *
     * @param string $url Product page URL
     * @return array|false|null Product data (the 'tmp' key lists the temporary files used),
     *                          null when the page could not be fetched or parsed,
     *                          false when a PHP \Error occurred while parsing
     */
    public function requestItemPage($url)
    {
        try {
            $filePath = null;
            $content = $this->requestUrl($url, 0, $filePath);
            if (!$content) {
                return;
            }

            $goods = $this->parseItemPage($content, $filePath);
            if (empty($goods)) {
                return;
            }

            $base = [];
            $base['name'] = $goods['store_name'];
            $base['image_main'] = $goods['image'];
            $base['category_name'] = $goods['category'];
            $base['short_info'] = $goods['store_info'];
            $base['description'] = $goods['description'];
            $base['technology'] = $goods['technology'];
            $base['price'] = $goods['price'];
            $base['tmp'] = [$filePath];

            // Single-specification product: the landing page already holds everything.
            if (empty($goods['attrs'])) {
                $base['image_slider'] = $goods['slider_image'];
                return $base;
            }

            $attrNames = is_array($goods['attrNames']) ? $goods['attrNames'] : [];
            $base['attr'] = [];
            $base['attr_name'] = $goods['attrNames'];
            $base['goods'] = [];
            $base['image_slider'] = [];
            $lowest = null;

            foreach ($goods['attrs'] as $id => $attr) {
                // Start from a clean record so nothing leaks over from the previous variant.
                $item = ['id' => $id, 'attr' => []];
                foreach ((array)$attr as $key => $val) {
                    $val = trim((string)$val);
                    $name = trim((string)($attrNames[$key] ?? $key));
                    $item['attr'][$name] = $val;
                    // Collect every distinct value of an attribute once.
                    if (!isset($base['attr'][$name]) || !in_array($val, $base['attr'][$name], true)) {
                        $base['attr'][$name][] = $val;
                    }
                }

                $variantUrl = $this->baseUrl . "dp/{$id}?psc=1";
                $item['url'] = $variantUrl;
                $filename = null;
                $variantContent = $this->requestUrl($variantUrl, 0, $filename);
                if (!$variantContent) {
                    break;
                }

                $base['tmp'][] = $filename;
                // Skip variants that are out of stock and therefore cannot be parsed.
                $info = $this->parseItemPage($variantContent, $filename);
                if (!$info) {
                    continue;
                }

                $item['image_main'] = $info['image'];
                $item['image_slider'] = $info['slider_image'];
                $item['price'] = $info['price']; // selling price
                $base['image_slider'][] = $item['image_main'];

                // The product price is the cheapest priced variant; when no variant has a
                // price, the landing-page price assigned above is kept.
                $value = $this->priceToFloat($info['price']);
                if ($value > 0 && ($lowest === null || $value < $lowest)) {
                    $lowest = $value;
                    $base['price'] = $info['price'];
                }

                $base['goods'][] = $item;
            }

            return $base;
        } catch (\Error $error) {
            $this->fmt('requestItemPage error', $error->getMessage());
            return false;
        }
    }

    /**
     * Converts a displayed price such as "1,299.00" into a float for comparisons.
     * Assumes "," is a thousands separator and "." the decimal point.
     */
    private function priceToFloat($price)
    {
        return (float)preg_replace('/[^0-9.]/', '', (string)$price);
    }

    /**
     * Returns the decoded JSON captured by the named group "json" of the first match
     * of $pattern, or null when the pattern does not match or the JSON is invalid.
     */
    private function extractJson($pattern, $content)
    {
        if (!preg_match($pattern, (string)$content, $match) || !isset($match['json'])) {
            return null;
        }

        return json_decode(trim($match['json']), true);
    }

    /**
     * Parses a product page: DOM fields from the cached file plus the JSON fragments
     * embedded in the page source (variants, title, images).
     *
     * @param string $content Raw page HTML
     * @param string $filePath Path of the cached copy of the same page
     * @return array|false Product info, or false when the page has no usable offer
     */
    private function parseItemPage($content, $filePath)
    {
        if (!$info = $this->parseItemPageByParsePageDoc($filePath)) {
            return false;
        }

        $info['slider_image'] = [];

        // Variant matrix: variant id => list of attribute values.
        $variants = $this->extractJson("/dimensionValuesDisplayData[ \"':].*?(?<json>{.*?})/is", $content);
        if (is_array($variants) && $variants) {
            $names = $this->extractJson("/dimensionsDisplay[ \"':].*?(?<json>\[.*?\])/is", $content);
            $info['attrs'] = $variants; // specification list
            $info['attrNames'] = is_array($names) ? $names : []; // specification names
        }

        $pro = $this->extractJson("/jQuery.parseJSON\([ \"'].*?(?<json>{.*?})['\")].*?/is", $content);
        if (!is_array($pro)) {
            $pro = [];
        }

        $info['store_name'] = trim((string)($pro['title'] ?? '')); // product name

        $color = $pro['landingAsinColor'] ?? null;
        if (is_scalar($color) && isset($pro['colorImages'][$color]) && is_array($pro['colorImages'][$color])) {
            foreach ($pro['colorImages'][$color] as $img) {
                if (isset($img['large'])) {
                    $info['slider_image'][] = $img['large'];
                }
            }
        } else {
            $data = $this->extractJson("/'colorImages': { 'initial': (?<json>\[[\s\S].*?}])/is", $content);
            if (is_array($data)) {
                foreach ($data as $item) {
                    if (isset($item['large'])) {
                        $info['slider_image'][] = $item['large'];
                    }
                }
            }
        }

        foreach ($info['slider_image'] as $key => $imgUrl) {
            $info['slider_image'][$key] = $this->executeEvent(self::IMAGE_DOWNLOAD, $imgUrl, $info['store_name']);
        }

        $info['image'] = $info['slider_image'][0] ?? "/favicon.ico";
        return $info;
    }

    /**
     * Reads the DOM-based product fields (breadcrumb categories, descriptions, price)
     * from a cached product page.
     *
     * @param string $fileName Path of the cached product page
     * @return array|false Product fields, or false when the page shows an availability
     *                     notice but neither a price nor a buy-box text
     */
    private function parseItemPageByParsePageDoc($fileName)
    {
        $phpQueryObject = \phpQuery::newDocumentFile($fileName);
        $res = \phpQuery::pq("#wayfinding-breadcrumbs_feature_div  .a-list-item", $phpQueryObject);
        $info = [];
        $res->each(function ($element) use (&$info) {
            $name = trim($element->nodeValue);
            // Skip the separator items between breadcrumb entries.
            if ($name == '›') {
                return;
            }

            $info['category'][] = trim($name, '"');
        });

        if (empty($info['category'])) {
            $info['category'][] = "服装";
        }

        $info['store_info'] = \phpQuery::pq("#detailBullets_feature_div", $phpQueryObject)->text(); // basic information
        $info['description'] = \phpQuery::pq("#productDescription", $phpQueryObject)->html(); // description
        $info['technology'] = \phpQuery::pq("#prodDetails", $phpQueryObject)->html(); // technical details
        // text() joins the text of every matched node, so take only the first price node.
        $info['price'] = \phpQuery::pq("#corePrice_feature_div .a-price .a-offscreen", $phpQueryObject)->eq(0)->text(); // price
        $info['attrs_show'] = \phpQuery::pq("#feature-bullets li", $phpQueryObject)->text(); // product parameters
        $info['but_txt'] = \phpQuery::pq("#partialStateBuybox span:first", $phpQueryObject)->text();
        if ($info['price']) {
            // Strip the currency sign and surrounding whitespace.
            $info['price'] = trim($info['price'], "¥ \t\n\r\0\x0B");
        }

        if (!$info['price'] && !$info['but_txt'] && \phpQuery::pq("#availability", $phpQueryObject)->text()) {
            return false;
        }

        return $info;
    }

    /**
     * Extracts the product links from a list page and crawls each product.
     *
     * $params['count'] is the resume cursor: items whose index is not greater than it
     * are skipped, and it is updated to the index of the item being processed. When it
     * is absent, processing starts with the first item.
     *
     * @param string $content List page HTML
     * @param array $params List parameters, passed on to the event handlers
     * @return array $params with the updated 'count'; 'code' is set to 'fail' when a
     *               failure handler aborted the loop
     */
    public function parserListPage($content, array $params)
    {
        $preg = "/sg-col-4-of-24 sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20[\s\S].*?<img[\s\S].*?src=['\"](?<img>[\s\S].*?)['\"][\s\S].*?class=['\"]a-size-base-plus a-color-base['\"]>(?<icon>[\s\S].*?)<\/span>[\s\S].*?target=['\"]_blank['\"][ ]?href=['\"](?<url>[\s\S].*?)['\"][\s\S].*a-size-base-plus a-color-base a-text-normal['\"]>(?<name>[\s\S].*?)<[\s\S].*<span class=['\"]a-offscreen['\"]>¥(?<price>[\s\S].*?)</i";
        preg_match_all($preg, (string)$content, $matches);
        $links = $matches['url'] ?? [];

        // A missing cursor means "nothing processed yet". Comparing null with an index
        // would treat index 0 as already done.
        $cursor = isset($params['count']) ? (int)$params['count'] : -1;
        foreach ($links as $key => $href) {
            if ($key <= $cursor) {
                continue;
            }

            $cursor = $key;
            $params['count'] = $key;
            // Build every URL from the base URL; never append to the previous item's URL.
            $url = $this->buildItemUrl($href);
            $goods = $this->requestItemPage($url);
            if (empty($goods)) {
                if ($this->executeEvent(self::REQUEST_ITEM_PAGE_FAIL, $url, $params) === false) {
                    $params['code'] = 'fail';
                    break;
                }

                continue;
            }

            $tmp = $goods['tmp'];
            unset($goods['tmp']);
            if ($this->executeEvent(self::REQUEST_ITEM_PAGE_SUCCESS, $goods, $url, $params) === false) {
                break;
            }

            $this->removeFiles($tmp);
        }

        return $params;
    }

    /**
     * Turns an href taken from a list page into an absolute product URL.
     */
    private function buildItemUrl($href)
    {
        $path = $this->parseUrl($href);
        if (preg_match('#^https?://#i', $path)) {
            return $path;
        }

        return $this->baseUrl . ltrim($path, '/');
    }

    /**
     * Fetches a URL, caching the response body in a temporary file.
     *
     * @param string $url URL to fetch
     * @param int $retry Number of attempts already used
     * @param string|null $filePath Receives the path of the temporary cache file
     * @return mixed Page content; null when all attempts returned nothing; the result of
     *               the TRY_TRIGGER_CODE handler when a robot-check page was served
     */
    public function requestUrl($url, $retry = 0, &$filePath = null)
    {
        $filePath = $this->getTmpFilePath($url);
        if ($retry > $this->retry) {
            return null;
        }

        if (file_exists($filePath)) {
            return $this->getLocalFile($filePath);
        }

        $headers = [
            'Accept-Language: zh-CN,zh;q=0.9',
            'Connection: keep-alive',
            'Content-Type: text/plain;charset=UTF-8',
            'Sec-Ch-Ua:"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
            'Sec-Ch-Ua-Mobile: ?0',
            'Sec-Ch-Ua-Platform: "Windows"',
            'Sec-Fetch-Dest: document',
            'Sec-Fetch-Mode: navigate',
            'Sec-Fetch-Site: same-origin',
            'Sec-Fetch-User: ?1',
            'Upgrade-Insecure-Requests: 1',
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        ];

        for ($attempt = (int)$retry; $attempt <= $this->retry; $attempt++) {
            sleep($this->pageWait);

            $content = trim((string)$this->fileGetContents($url, $headers));
            if (!$content) {
                continue;
            }

            if ($this->isTriggerCode($content)) {
                return $this->executeEvent(self::TRY_TRIGGER_CODE, $content, $url);
            }

            $this->_success_count = $this->_success_count + 1;
            $this->_success_total = $this->_success_count;
            $this->write($filePath, $content);

            // Take a long break after every $waitCount successful fetches.
            if ($this->waitCount && $this->_success_count % $this->waitCount == 0) {
                $this->fmt('当前程序触发了成功抓取等待时间', '将于：' . date('Y-m-d H:i:s', time() + $this->waitTime) . '结束');
                sleep($this->waitTime);
            }

            return $content;
        }

        return null;
    }

    /**
     * Performs a GET request through PHP's stream wrapper.
     *
     * @param string $url URL to fetch
     * @param array $headers Raw request header lines
     * @return string|false Response body, or false on failure
     */
    public function fileGetContents($url, array $headers)
    {
        $http = [
            'method' => "GET",
            'header' => implode("\r\n", $headers)
        ];

        if ($this->proxy) {
            // Stream contexts expect "tcp://host:port"; accept a bare "host:port" as well.
            $proxy = (string)$this->proxy;
            if (strpos($proxy, '://') === false) {
                $proxy = 'tcp://' . $proxy;
            }

            $http['proxy'] = $proxy;
            $http['request_fulluri'] = true;
        }

        // The options of both http:// and https:// URLs live under the "http" key.
        $context = stream_context_create(['http' => $http]);
        try {
            $body = file_get_contents($url, false, $context);
        } catch (\Exception $exception) {
            // Only reached when an error handler converts warnings into exceptions.
            $this->fmt("请求错误：" . $exception->getMessage());
            return false;
        }

        if ($body === false) {
            $error = error_get_last();
            $this->fmt("请求错误：" . ($error['message'] ?? $url));
        }

        return $body;
    }

    /**
     * Default TRY_TRIGGER_CODE handler: waits $waitTime seconds and yields no content.
     *
     * @param Amazon $ow Crawler instance that raised the event
     * @param string $content Robot-check page content
     * @param string $url URL that triggered the check
     * @return null
     */
    public function tryTriggerCode($ow, $content, $url)
    {
        sleep($this->waitTime);
        return null;
    }

    /**
     * Calls the handler registered for $name with ($this, ...$params).
     *
     * @param string $name Event name
     * @param mixed ...$params Event payload
     * @return mixed The handler's return value, or null when no handler is registered
     */
    public function executeEvent($name, ...$params)
    {
        if (!isset($this->_event[$name])) {
            return;
        }

        array_unshift($params, $this);
        return call_user_func_array($this->_event[$name], $params);
    }

    /**
     * Prints a timestamped log line; the arguments are joined with ";".
     */
    private function fmt(...$parts)
    {
        echo date('Y-m-d H:i:s'), implode(';', $parts), "\n";
    }

    /**
     * Decodes HTML entities and percent-encoding of an href and trims whitespace.
     */
    private function parseUrl($url)
    {
        return trim(urldecode(html_entity_decode($url)));
    }

    /**
     * Tells whether the page is the robot-check page.
     */
    private function isTriggerCode($content)
    {
        $preg = "/并非自动程序/i";
        return preg_match($preg, (string)$content) === 1;
    }

    /**
     * Reads a cached page; yields an empty string for an empty or unreadable file.
     */
    private function getLocalFile($filePath)
    {
        $content = file_get_contents($filePath);
        return $content === false ? '' : $content;
    }

    /**
     * Path of the temporary cache file that belongs to a URL.
     */
    private function getTmpFilePath($url)
    {
        return $this->getFilePath() . md5($url) . '.html';
    }

    /**
     * Writes $content to $filePath, creating the directory when needed.
     *
     * @return bool false when the directory or the file could not be written
     */
    private function write($filePath, $content)
    {
        $path = dirname($filePath);
        // The second is_dir() covers another process creating the directory concurrently.
        if (!is_dir($path) && !mkdir($path, 0775, true) && !is_dir($path)) {
            $this->fmt('cannot create directory', $path);
            return false;
        }

        if (file_put_contents($filePath, $content) === false) {
            $this->fmt('cannot write file', $filePath);
            return false;
        }

        return true;
    }

    /**
     * Directory (with trailing separator) that holds the temporary page files.
     */
    private function getFilePath()
    {
        return $this->filePath . DIRECTORY_SEPARATOR . 'tmp' . DIRECTORY_SEPARATOR;
    }

    /**
     * Registers the built-in event handlers and verifies that phpQuery is available.
     *
     * @throws \RuntimeException when the phpQuery class cannot be loaded
     */
    protected function __init()
    {
        $this->_event[self::IMAGE_DOWNLOAD] = [$this, 'downloadImage'];
        $this->_event[self::TRY_TRIGGER_CODE] = [$this, 'tryTriggerCode'];

        if (!class_exists('phpQuery')) {
            throw new \RuntimeException('call you composer require electrolinux/phpquery');
        }
    }

    /**
     * Default IMAGE_DOWNLOAD handler: stores a JPEG/PNG/GIF image below
     * "<imgPath>/<Ymd>/" and returns its local path.
     *
     * @param Amazon $ow Crawler instance that raised the event
     * @param string $url Image URL
     * @param string $realName Product name (unused by the default handler)
     * @return string Local file path, or the original URL when the download failed or the
     *                response is not a supported image type
     */
    private function downloadImage($ow, $url, $realName)
    {
        $ch = curl_init($url);
        if ($ch === false) {
            return $url;
        }

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $imageData = curl_exec($ch);
        $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($statusCode != 200 || !is_string($imageData) || $imageData === '') {
            return $url;
        }

        $extensions = [
            'image/jpeg' => '.jpg',
            'image/png' => '.png',
            'image/gif' => '.gif',
        ];

        // Trust the sniffed content type rather than the URL suffix.
        $finfo = new \finfo(FILEINFO_MIME_TYPE);
        $type = $finfo->buffer($imageData);
        if (!is_string($type) || !isset($extensions[$type])) {
            return $url;
        }

        $savePath = $this->imgPath . DIRECTORY_SEPARATOR . date('Ymd')
            . DIRECTORY_SEPARATOR . md5($url) . $extensions[$type];

        return $this->write($savePath, $imageData) ? $savePath : $url;
    }

    /**
     * Deletes one or several temporary files; entries that are not existing files are ignored.
     *
     * @param string|array $paths A single path or a list of paths
     */
    private function removeFiles($paths)
    {
        $paths = is_array($paths) ? $paths : [$paths];
        foreach ($paths as $path) {
            if (is_string($path) && is_file($path)) {
                unlink($path);
            }
        }
    }
}
